Main question to address: What areas of NYC are under-serviced by ADA accessible stations?

Plan: Somehow randomly “sample” locations in the NYC territory in (longitude, latitude) unit pairs and then calculate distance from that point to the nearest ada-certified (accessible) subway station. Normalize to nearest (accessible or not accessible) station?

Need: - Data for which stations are ADA-accessible - Data for station locations (longitude, latitude) - Some way to sample the territory of NYC (it’s an odd shape)

library(tidyverse)
library(ggmap)
library(cowplot)
library(viridis)
library(geosphere)
library(rgdal)
library(tmap)

Not sure which are the most reliable datasets to use here.

0.1 Data import, cleanup, and some exploration

ada_raw <- read_csv("./data/Elevator Escalator Station Data.csv") %>% 
  arrange(station)
subloc_raw <- read_csv("./data/NYC_Transit_Subway_Entrance_And_Exit_Data.csv") %>% 
  arrange(`Station Name`) #%>% 
  #mutate_all(str_to_lower)
set.seed(2018)
nyc311_loc <- read_csv(file = "./data/unique_nyc_2015_311.csv") %>%
  distinct() %>% 
  sample_frac(0.25, replace = FALSE)
colnames(subloc_raw) <- colnames(subloc_raw) %>% 
  str_to_lower() %>% 
  gsub(pattern = " ", replacement = "_")
colnames(ada_raw) <- colnames(ada_raw) %>% 
  str_to_lower()
colnames(nyc311_loc) <- colnames(nyc311_loc) %>% 
  str_to_lower()
glimpse(ada_raw)
Observations: 570
Variables: 7
$ station       <chr> "125 St", "125 St", "125 St", "125 St", "125 St"...
$ borough       <chr> "MN", "MN", "MN", "MN", "MN", "MN", "MN", "MN", ...
$ trainno       <chr> "A/B/C/D", "A/B/C/D", "4/5/6/METRO-NORTH", "1", ...
$ equipmentno   <chr> "EL143", "EL144", "EL126", "ES101", "ES103", "EL...
$ equipmenttype <chr> "EL", "EL", "EL", "ES", "ES", "EL", "EL", "ES", ...
$ serving       <chr> "MEZZANINE AND DOWNTOWN", "MEZZANINE AND UPTOWN"...
$ ada           <chr> "Y", "Y", "Y", "N", "N", "Y", "Y", "N", "Y", "Y"...
glimpse(subloc_raw)
Observations: 1,868
Variables: 32
$ division           <chr> "IND", "IRT", "IRT", "IRT", "IRT", "IRT", "...
$ line               <chr> "8 Avenue", "Broadway-7th Ave", "Broadway-7...
$ station_name       <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ station_latitude   <dbl> 40.79609, 40.79945, 40.79945, 40.79945, 40....
$ station_longitude  <dbl> -73.96145, -73.96838, -73.96838, -73.96838,...
$ route1             <chr> "B", "1", "1", "1", "1", "1", "7", "7", "7"...
$ route2             <chr> "C", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ route3             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route4             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route5             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route6             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route7             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route8             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route9             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route10            <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ route11            <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ entrance_type      <chr> "Stair", "Stair", "Stair", "Stair", "Stair"...
$ entry              <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ exit_only          <chr> NA, NA, NA, NA, NA, "Yes", NA, NA, NA, NA, ...
$ vending            <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ staffing           <chr> "FULL", "FULL", "FULL", "FULL", "FULL", "NO...
$ staff_hours        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ ada                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ ada_notes          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ free_crossover     <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
$ north_south_street <chr> "Central Park West", "Broadway", "Broadway"...
$ east_west_street   <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ corner             <chr> "NW", "NW", "NE", "NW", "NE", "SE", "SE", "...
$ entrance_latitude  <dbl> 40.79626, 40.79928, 40.79914, 40.79941, 40....
$ entrance_longitude <dbl> -73.96161, -73.96868, -73.96831, -73.96861,...
$ station_location   <chr> "(40.796092, -73.961454)", "(40.799446, -73...
$ entrance_location  <chr> "(40.796263, -73.961610)", "(40.799282, -73...
glimpse(nyc311_loc)
Observations: 103,510
Variables: 3
$ borough   <chr> "BROOKLYN", "QUEENS", "MANHATTAN", "BROOKLYN", "QUEE...
$ latitude  <dbl> 40.60375, 40.76693, 40.79832, 40.62297, 40.68408, 40...
$ longitude <dbl> -73.99558, -73.88903, -73.94185, -73.91094, -73.8448...

Mucking around, trying to see what I have to work with.

# What are the cols and how many missing values per col?
subloc_raw <- subloc_raw %>% 
  replace_na(
    list(
      route1 = "", route2 = "", route3 = "", route4 = "", 
      route5 = "", route6 = "", route7 = "", route8 = "", 
      route9 = "", route10 = "", route11 = ""
      )
    ) %>% 
  unite(col = routes, route1:route11, sep = "")
subloc_raw %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
          division               line       station_name 
                 0                  0                  0 
  station_latitude  station_longitude             routes 
                 0                  0                  0 
     entrance_type              entry          exit_only 
                 0                  0               1812 
           vending           staffing        staff_hours 
                 0                  0               1828 
               ada          ada_notes     free_crossover 
                 0               1793                  0 
north_south_street   east_west_street             corner 
                29                 35                 32 
 entrance_latitude entrance_longitude   station_location 
                 0                  0                  0 
 entrance_location 
                 0 
ada_raw %>% 
  is.na %>% 
  as.data.frame() %>% 
  sapply(sum)
      station       borough       trainno   equipmentno equipmenttype 
            0             0             0             0             0 
      serving           ada 
            0             0 
# would help future steps to assign a borough to each station - can the ada_raw data be used? 
subloc_raw %>% 
  select(line:station_longitude) %>% 
  distinct() %>% 
  anti_join(ada_raw, by = c("station_name" = "station"))
# A tibble: 418 x 4
   line          station_name            station_latitude station_longitu~
   <chr>         <chr>                              <dbl>            <dbl>
 1 8 Avenue      103rd St                            40.8            -74.0
 2 Broadway-7th~ 103rd St                            40.8            -74.0
 3 Flushing      103rd St                            40.7            -73.9
 4 Lexington     103rd St                            40.8            -73.9
 5 Broadway Jam~ 104th St-102nd St                   40.7            -73.8
 6 Liberty       104th St-Oxford Av                  40.7            -73.8
 7 Lexington     110th St                            40.8            -73.9
 8 Lenox         110th St-Central Park ~             40.8            -74.0
 9 Broadway Jam~ 111th St                            40.7            -73.8
10 Flushing      111th St                            40.8            -73.9
# ... with 408 more rows
# not many station names match across the two datasets - would take some time to untangle - is there another way?
test <- subloc_raw %>% 
  select(line:station_longitude) %>% 
  distinct() %>% 
  mutate(join_key = paste(round(station_latitude, 2), round(station_longitude, 2), sep = "_")) %>% 
  left_join(nyc311_loc %>% 
              mutate(latitude = round(latitude, 2),
                     longitude = round(longitude, 2),
                     join_key = paste(latitude, longitude, sep = "_")) %>% 
              distinct(), by = "join_key")
# all of the stations can be assigned a location based on the rounded NYC 311 dataset!
subloc_bor <- subloc_raw %>% 
  mutate(join_key = paste(round(station_latitude, 2), round(station_longitude, 2), sep = "_")) %>% 
  left_join(nyc311_loc %>% 
              mutate(latitude = round(latitude, 2),
                     longitude = round(longitude, 2),
                     join_key = paste(latitude, longitude, sep = "_")) %>% 
              distinct(), by = "join_key") %>% 
  # cleanup 
  select(-join_key, -latitude, -longitude) %>% 
  select(borough, everything())
glimpse(subloc_bor)
Observations: 1,957
Variables: 23
$ borough            <chr> "MANHATTAN", "MANHATTAN", "MANHATTAN", "MAN...
$ division           <chr> "IND", "IRT", "IRT", "IRT", "IRT", "IRT", "...
$ line               <chr> "8 Avenue", "Broadway-7th Ave", "Broadway-7...
$ station_name       <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ station_latitude   <dbl> 40.79609, 40.79945, 40.79945, 40.79945, 40....
$ station_longitude  <dbl> -73.96145, -73.96838, -73.96838, -73.96838,...
$ routes             <chr> "BC", "1", "1", "1", "1", "1", "7", "7", "7...
$ entrance_type      <chr> "Stair", "Stair", "Stair", "Stair", "Stair"...
$ entry              <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ exit_only          <chr> NA, NA, NA, NA, NA, "Yes", NA, NA, NA, NA, ...
$ vending            <chr> "YES", "YES", "YES", "YES", "YES", "NO", "Y...
$ staffing           <chr> "FULL", "FULL", "FULL", "FULL", "FULL", "NO...
$ staff_hours        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ ada                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ ada_notes          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ free_crossover     <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T...
$ north_south_street <chr> "Central Park West", "Broadway", "Broadway"...
$ east_west_street   <chr> "103rd St", "103rd St", "103rd St", "103rd ...
$ corner             <chr> "NW", "NW", "NE", "NW", "NE", "SE", "SE", "...
$ entrance_latitude  <dbl> 40.79626, 40.79928, 40.79914, 40.79941, 40....
$ entrance_longitude <dbl> -73.96161, -73.96868, -73.96831, -73.96861,...
$ station_location   <chr> "(40.796092, -73.961454)", "(40.799446, -73...
$ entrance_location  <chr> "(40.796263, -73.961610)", "(40.799282, -73...
# after investigating, there are 27 extra rows - some of the nyc 311 locations are wrongly labeled

The Subway Entrance and Exit data seems to be good enough for an initial analysis. Hopefully the ADA column in that dataset is reliable. Need to compare this with the other datasets collected at some point, but joining is a problem.

To do: - check against Wire Monkey’s joined dataset - figure out the station name inconsistencies?

For now, stick with the sub.location.raw.data for the rest of the analysis.

Map of all subway stations in this dataset:

nyc_map <- get_map(location = "New York City")
ggmap(nyc_map) +
  geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
  facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).

#subloc_bor %>% 
#  ggplot(aes(x = station_longitude, y = station_latitude, color = borough)) +
#  geom_point() +
#  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
#  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
#  xlab("longitude") +
#  ylab("latitude") +
#  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
#  facet_wrap(~borough)

# there's a couple of wrong assignments - try removing them:
subloc_bor <- subloc_bor %>% 
  filter(!(borough == "MANHATTAN" & station_latitude < 40.7)) %>%
  filter(!(borough == "BRONX" & station_latitude < 40.8))
ggmap(nyc_map) +
  geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
  facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).

test <- subloc_bor %>% 
  filter(borough == "BRONX" & station_latitude < 40.8)
ggmap(nyc_map) +
  geom_point(data = test, aes(x = station_longitude, y = station_latitude)) +
  ylim(40.495992, 40.915568) +
  xlim(-74.257159, -73.699215)
Warning: Removed 1 rows containing missing values (geom_rect).

odd <- subloc_bor %>% 
  select(borough:station_name) %>% 
  distinct() %>% 
  group_by(division, line, station_name) %>% 
  count() %>% 
  filter(n > 1) %>% 
  arrange(station_name)
test2 <- subloc_bor %>% 
  select(borough:station_longitude) %>% 
  distinct() %>% 
  filter(division %in% odd$division & station_name %in% odd$station_name & line %in% odd$line) %>% 
  arrange(station_latitude, station_longitude, borough)
# doesn't matter if the two annotations are QUEENS and BROOKLYN - will be treated together, but mixups that involve BRONX and MANHATTAN would be helpful to fix
test3 <- test2 %>% 
  filter(station_latitude > 40.70442) %>% 
  arrange(station_longitude, station_latitude, borough)
test3.brnx.man <- test3 %>% 
  filter(borough == "MANHATTAN" | borough == "BROND")
test4 <- test3 %>%
  filter(station_name %in% test3.brnx.man$station_name)
test4 %>% 
  select(division, line, station_name) %>% 
  distinct()
# A tibble: 7 x 3
  division line             station_name            
  <chr>    <chr>            <chr>                   
1 IND      6 Avenue         East Broadway           
2 IND      63rd Street      Roosevelt Island        
3 IRT      Jerome           138th St                
4 IRT      Jerome           149th St-Grand Concourse
5 IRT      Pelham           138th St-3rd Ave        
6 IRT      Broadway-7th Ave 207th St                
7 IRT      Broadway-7th Ave Marble Hill-225th St    
subloc_bor <- subloc_bor %>% 
  filter(!(division == "IND" & line == "6 Avenue" & station_name == "East Broadway" & borough == "BROOKLYN")) %>% 
  filter(!(division == "IRT" & line == "Jerome" & station_name == "149th St-Grand Concourse" & borough == "MANHATTAN")) %>% 
  filter(!(division == "IRT" & line == "Pelham" & station_name == "138th St-3rd Ave" & borough == "MANHATTAN")) %>% 
  filter(!(division == "IRT" & line == "Broadway-7th Ave" & station_name == "207th St" & borough == "BRONX")) 
# The F train Roosevelt Island stop is both Queens and Manhattan - leave it as such
# The Marble Hill - 225th St stop is also an odd station - leave as both Manhattan and Bronx
ggmap(nyc_map) +
  geom_point(data = subloc_bor, aes(x = station_longitude, y = station_latitude, color = borough)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
  facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).

Notes: - No stations on Staten Island - will exclude Saten Island from this point on - Most stations have more than one row (multiple entrances/exits) - need to compress info - Station names in the station column are NOT unique (multiple 103rd St, etc) in the entrance/exit data, but station location may be a unique and consistent handle to grab all the entrances/exits associated with a station - For now, will call any station that has at least one ADA-accessible entrance/exit “accessible”, but we know that some stations are only partially accessible. For some station, different subway lines are serviced by different platforms within the same station, and not all platforms are fully accessible. This is where the elevator/escalator data joining may be useful.

# some info about entrances
subloc_bor %>% 
  ggplot(aes(x = entrance_type, fill = ada)) +
  geom_bar() +
  xlab("Entrance Type")

# same scale
subloc_bor %>% 
  ggplot(aes(x = entrance_type, fill = ada)) +
  geom_bar(position = "fill") +
  xlab("Entrance Type") +
  ylab("Proportion of Entrance that is ADA-Accessible")

According to the description of the variables in this dataset, ADA == TRUE indicates that the station is ADA-Accessible, but not necessarily if that entrance is

ggmap(nyc_map) +
  geom_point(
    data = subloc_bor, 
    aes(x = station_longitude, y = station_latitude, color = ada),
    size = 1.5,
    alpha = 0.5) +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nIn the Subway Entrance/Exit Locations Dataset\nColor by Accessibility")
Warning: Removed 1 rows containing missing values (geom_rect).

# this data is missing the new Q-line information 

0.2 Get (long,lat) pair locations from all over NYC

Now the next questions is how do I “sample” points from the area of NYC?

I’ve worked a little bit with the NYC 311 call dataset before. Source: https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9

Remembered that it has latitude/longitude location info for the calls. Grabbed the locations from the 2015 NYC 311 calls data - use as random sample?

nyc311_loc %>% 
  ggplot(aes(x = longitude, y = latitude)) +
  geom_bin2d(bins = 400) +
  coord_quickmap() +
  scale_fill_viridis() +
  ggtitle("Locations in the 2015 NYC 311 Calls Dataset")

Not bad. Empty areas are parks probably, should be okay to not include those.

# create unique handle using station name, division, and line columns
subloc_bor <- subloc_bor %>% 
  mutate(station_id = paste(division, line, station_name, sep = " / ")) %>% 
  select(borough, station_id, everything())
subloc_sml <- subloc_bor %>% 
  select(station_id, borough:routes, ada) %>% 
  distinct()
glimpse(subloc_sml)
Observations: 495
Variables: 9
$ station_id        <chr> "IND / 8 Avenue / 103rd St", "IRT / Broadway...
$ borough           <chr> "MANHATTAN", "MANHATTAN", "QUEENS", "MANHATT...
$ division          <chr> "IND", "IRT", "IRT", "IRT", "BMT", "IND", "I...
$ line              <chr> "8 Avenue", "Broadway-7th Ave", "Flushing", ...
$ station_name      <chr> "103rd St", "103rd St", "103rd St", "103rd S...
$ station_latitude  <dbl> 40.79609, 40.79945, 40.74986, 40.79060, 40.6...
$ station_longitude <dbl> -73.96145, -73.96838, -73.86270, -73.94748, ...
$ routes            <chr> "BC", "1", "7", "6", "JZ", "A", "6", "23", "...
$ ada               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
not.uni <- subloc_sml %>% 
  group_by(station_id) %>% 
  count() %>% 
  filter(n > 1)
subloc_sml_prob <- subloc_sml %>% 
  filter(station_id %in% not.uni$station_id) %>% 
  arrange(station_id)
# there's some dublicate stations still, must have had several station locations associated with those
subloc_sml <- subloc_sml %>% 
  group_by(station_id) %>% 
  mutate(avg_lat = mean(station_latitude)) %>% 
  mutate(avg_long = mean(station_longitude)) %>% 
  select(station_id:borough, avg_lat:avg_long, routes, ada) %>% 
  ungroup() %>% 
  distinct()
nrow(subloc_sml)
[1] 485
length(subloc_sml$station_id)
[1] 485
ggmap(nyc_map) +
  geom_point(data = subloc_sml, aes(x = avg_long, y = avg_lat, color = borough)) +
  ylim(40.495992, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.257159, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("All NYC Subway Stations\nin the Subway Entrance/Exit Locations Data") +
  facet_wrap(~borough)
Warning: Removed 4 rows containing missing values (geom_rect).

There’s still some duplicates in the stations that have slightly different route assignments, but similar location. I’m going to leave them in for now, not sure what to do with them.

0.3 Calculate distances between random location in NYC and the closest subway station

Small-scale test on a tiny portion of the subway stations and the 311 locations:

# small scale test
# pick 10 locations in NYC and find the distance from that point to 5 stations in the city
loc_test <- nyc311_loc %>% 
  sample_n(10, replace = FALSE)
t1 <- rbind(subloc_sml$station_id, subloc_sml$station_id)
t2 <- as.data.frame(rbind(t1, t1, t1, t1, t1), stringsAsFactors = FALSE)
colnames(t2) <- paste("stat", 1:ncol(t2), sep = "")
t3 <- t2 %>% select(1:5)
t3
                       stat1                             stat2
1  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
2  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
3  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
4  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
5  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
6  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
7  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
8  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
9  IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
10 IND / 8 Avenue / 103rd St IRT / Broadway-7th Ave / 103rd St
                       stat3                      stat4
1  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
2  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
3  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
4  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
5  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
6  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
7  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
8  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
9  IRT / Flushing / 103rd St IRT / Lexington / 103rd St
10 IRT / Flushing / 103rd St IRT / Lexington / 103rd St
                                        stat5
1  BMT / Broadway Jamaica / 104th St-102nd St
2  BMT / Broadway Jamaica / 104th St-102nd St
3  BMT / Broadway Jamaica / 104th St-102nd St
4  BMT / Broadway Jamaica / 104th St-102nd St
5  BMT / Broadway Jamaica / 104th St-102nd St
6  BMT / Broadway Jamaica / 104th St-102nd St
7  BMT / Broadway Jamaica / 104th St-102nd St
8  BMT / Broadway Jamaica / 104th St-102nd St
9  BMT / Broadway Jamaica / 104th St-102nd St
10 BMT / Broadway Jamaica / 104th St-102nd St
dist_test <- bind_cols(loc_test, t3) %>%
  gather(key = "station.num", value = "station_id", 4:8) %>% 
  arrange(station.num)
# each of the 10 (long,lat) points is now paired with each of the 5 stations selected
dist_test
# A tibble: 50 x 5
   borough   latitude longitude station.num station_id               
   <chr>        <dbl>     <dbl> <chr>       <chr>                    
 1 BRONX         40.9     -73.9 stat1       IND / 8 Avenue / 103rd St
 2 BROOKLYN      40.6     -73.9 stat1       IND / 8 Avenue / 103rd St
 3 MANHATTAN     40.9     -73.9 stat1       IND / 8 Avenue / 103rd St
 4 BROOKLYN      40.6     -74.0 stat1       IND / 8 Avenue / 103rd St
 5 QUEENS        40.8     -73.9 stat1       IND / 8 Avenue / 103rd St
 6 QUEENS        40.6     -73.9 stat1       IND / 8 Avenue / 103rd St
 7 QUEENS        40.8     -73.9 stat1       IND / 8 Avenue / 103rd St
 8 BROOKLYN      40.6     -73.9 stat1       IND / 8 Avenue / 103rd St
 9 QUEENS        40.7     -73.8 stat1       IND / 8 Avenue / 103rd St
10 QUEENS        40.7     -73.9 stat1       IND / 8 Avenue / 103rd St
# ... with 40 more rows
# link the station.id to a (long,lat) pair for that station: 
dist_test_w_stat_loc <- dist_test %>% 
  inner_join(subloc_sml, by = "station_id")
# now calculate distance between point and the station using the geosphere package:
dist.test.res <- dist_test_w_stat_loc  %>% 
  mutate(
    dist.to.stat = distGeo(
      # p1 = NYC 311 location
      p1 = as.matrix(dist_test_w_stat_loc %>% select(longitude, latitude)), 
      # p2 = subway station location
      p2 = as.matrix(dist_test_w_stat_loc  %>% select(avg_long, avg_lat))
      ) * 0.000621371  # convert meters to miles
    ) %>% 
  mutate(loc.id = paste(borough.y, latitude, longitude, sep = " / "))

Note: the distances are based on direct distance from point to point

To do: - Is there some way to incorporate Manhattan Distance into the calculation to get a more accurate distance measurement?

What’s the distance from the NYC 311 point to the closest station (out of the 5):

dim(dist.test.res)
[1] 50 12
dist.test.nearest <- dist.test.res %>% 
  group_by(loc.id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((dist.test.res %>% select(longitude, latitude, loc.id)), by = "loc.id") %>% 
  unique()
dim(dist.test.nearest)
[1] 20  4
ggmap(nyc_map) +
  geom_point(data = dist.test.nearest, aes(x = longitude, y = latitude, color = nearest.stat), size = 3) +
  scale_color_viridis(option = "B")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("Direct distance to nearest subway station from point in NYC")
Warning: Removed 1 rows containing missing values (geom_rect).

Test run with a small fraction of the data seems to have worked, now to tackle the full data.

0.4 Full scale calculation for all points in the 311 calls dataset and all subway stations

0.4.1 Run the following chunk at your own risk. R took up >40GB of RAM at some points for me.

(It’s currently set to not evaluate)

I exported the distance to any nearest subway station to any point and the distance to the nearest ada subway station into csv files. They’re imported in the next chunk.

glimpse(nyc311_loc)
Observations: 103,510
Variables: 3
$ borough   <chr> "BROOKLYN", "QUEENS", "MANHATTAN", "BROOKLYN", "QUEE...
$ latitude  <dbl> 40.60375, 40.76693, 40.79832, 40.62297, 40.68408, 40...
$ longitude <dbl> -73.99558, -73.88903, -73.94185, -73.91094, -73.8448...
glimpse(subloc_sml)
Observations: 485
Variables: 6
$ station_id <chr> "IND / 8 Avenue / 103rd St", "IRT / Broadway-7th Av...
$ borough    <chr> "MANHATTAN", "MANHATTAN", "QUEENS", "MANHATTAN", "Q...
$ avg_lat    <dbl> 40.79609, 40.79945, 40.74986, 40.79060, 40.69518, 4...
$ avg_long   <dbl> -73.96145, -73.96838, -73.86270, -73.94748, -73.844...
$ routes     <chr> "BC", "1", "7", "6", "JZ", "A", "6", "23", "J", "7"...
$ ada        <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA...
nyc311_loc <- nyc311_loc %>% 
  mutate(
    borough = ifelse(
      borough == "MANHATTAN", "MANHATTAN",
      ifelse(
        borough == "BRONX", "BRONX", 
        ifelse(
          borough == "BROOKLYN", "BROOKLYN_QUEENS", "BROOKLYN_QUEENS"
          )
        )
      )
    )
subloc_sml <- subloc_sml %>% 
  mutate(
    borough = ifelse(
      borough == "MANHATTAN", "MANHATTAN",
      ifelse(
        borough == "BRONX", "BRONX", 
        ifelse(
          borough == "BROOKLYN", "BROOKLYN_QUEENS", "BROOKLYN_QUEENS"
          )
        )
      )
    )
set.seed(2001)
test <- nyc311_loc %>% 
  sample_frac(0.001, replace = FALSE) %>% 
  full_join(subloc_sml, by = "borough") 
distances <- test %>% 
  mutate(
    dist.to.stat = distGeo(
      # p1 = NYC 311 location
      p1 = as.matrix(test %>% select(longitude, latitude)), 
      # p2 = subway station location
      p2 = as.matrix(test %>% select(avg_long, avg_lat))
      ) * 0.000621371  # convert meters to miles
    ) %>% 
  mutate(loc_id = paste(borough, latitude, longitude, sep = " / "))
min.distance <- distances %>% 
  group_by(loc_id) %>% 
  summarize(nearest.stat = min(dist.to.stat)) %>% 
  ungroup() %>% 
  inner_join((distances %>% select(longitude, latitude, loc_id)), by = "loc_id") %>% 
  unique()
nyc_loc_full_join <- nyc311_loc %>% 
  full_join(subloc_sml, by = "borough") 
p1 <- as.matrix(nyc_loc_full_join %>% select(longitude, latitude))
p2 <- as.matrix(nyc_loc_full_join %>% select(avg_long, avg_lat))
dist.to.stat <- distGeo(
      # p1 = NYC 311 location
      p1 = p1, 
      # p2 = subway station location
      p2 = p2
      ) * 0.000621371
nyc_loc_full_join$dist.to.stat <- dist.to.stat
nyc_loc_full_join$loc_id <- paste(nyc_loc_full_join$borough, nyc_loc_full_join$latitude, nyc_loc_full_join$longitude, sep = " / ")

nyc_loc_full_join %>% 
  summarize(
    mean.dist = mean(dist.to.stat),
    med.dist = median(dist.to.stat)
    )
# A tibble: 1 x 2
  mean.dist med.dist
      <dbl>    <dbl>
1      6.09     5.61
nyc_loc_full_join %>% 
  ggplot(aes(dist.to.stat)) +
  geom_histogram(bins = 100) +
  geom_vline(xintercept = mean(dist.to.stat), color = "#E69F00", size = 2, alpha = 0.8) +
  geom_vline(xintercept = median(dist.to.stat), color = "#56B4E9", size = 2, alpha = 0.8) +
  ggtitle("Histogram of distances to all stations") +
  xlab("Distance from point in NYC to subway station\nWithin burrough") 

nyc_loc_full_join %>% 
  ggplot(aes(sqrt(dist.to.stat))) +
  geom_histogram(bins = 100) +
  geom_vline(xintercept = mean(sqrt(dist.to.stat)), color = "#E69F00", size = 2, alpha = 0.8) +
  geom_vline(xintercept = median(sqrt(dist.to.stat)), color = "#56B4E9", size = 2, alpha = 0.8) +
  ggtitle("Histogram of square root of distances to all stations") +
  xlab("Square Root of distance from point in NYC to subway station\nWithin burrough") 

nyc_loc_full_join %>%
  ggplot(aes(dist.to.stat)) +
  geom_histogram(bins = 100) +
  geom_vline(xintercept = mean(dist.to.stat), color = "#E69F00", size = 2, alpha = 0.8) +
  geom_vline(xintercept = median(dist.to.stat), color = "#56B4E9", size = 2, alpha = 0.8) +  
  ggtitle("Histogram of distances to ADA-Accessible stations only") +
  xlab("Distance from point in NYC to ADA-accessible subway station\nWithin burrough") +
  facet_wrap(~ada, nrow = 1)

all.distance <- nyc_loc_full_join %>% 
  group_by(loc_id) %>% 
  summarize(
    nearest.stat = min(dist.to.stat),
    mean.dist = mean(dist.to.stat),
    med.dist = median(dist.to.stat)
    ) %>% 
  ungroup() %>% 
  inner_join(nyc_loc_full_join %>% select(loc_id, borough:longitude), by = "loc_id") %>% 
  distinct()
all.distance %>% 
  select(loc_id:med.dist) %>% 
  gather(key = "Dist.type", value = "Distance", -loc_id) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Dist.type, nrow = 2)

all.distance %>% 
  select(loc_id, mean.dist:med.dist) %>% 
  gather(key = "Dist.type", value = "Distance", -loc_id) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Dist.type, nrow = 1)

all.distance %>% 
  select(loc_id, borough, mean.dist:med.dist) %>% 
  gather(key = "Dist.type", value = "Distance", mean.dist:med.dist) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(Dist.type ~ borough)

ada.distance <- nyc_loc_full_join %>% 
  filter(ada) %>% 
  group_by(loc_id) %>% 
  summarize(
    nearest.ada.stat = min(dist.to.stat),
    mean.ada.stat = mean(dist.to.stat),
    med.ada.stat = median(dist.to.stat)
    ) %>% 
  ungroup() %>% 
  inner_join((nyc_loc_full_join %>% select(loc_id, borough:longitude)), by = "loc_id") %>% 
  distinct()
ada.distance %>% 
  select(loc_id:med.ada.stat) %>% 
  gather(key = "Dist.type", value = "Distance", -loc_id) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Dist.type, nrow = 2)

ada.distance %>% 
  select(loc_id, mean.ada.stat:med.ada.stat) %>% 
  gather(key = "Dist.type", value = "Distance", -loc_id) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Dist.type, nrow = 1)

ada.distance %>% 
  select(loc_id, borough, mean.ada.stat:med.ada.stat) %>% 
  gather(key = "Dist.type", value = "Distance", mean.ada.stat:med.ada.stat) %>% 
  ggplot(aes(Distance)) +
  geom_histogram(bins = 50) +
  facet_wrap(Dist.type ~ borough)

rm(nyc_loc_full_join, p1, p2, dist.to.stat)
gc()
          used  (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells 2651989 141.7    8046515  429.8   8046515  429.8
Vcells 9672844  73.8  774204892 5906.8 967756115 7383.4

0.4.1.1 Plots of the results

Distance to any type of station from a particular point in NYC:

all.distance %>% 
  ggplot(aes(x = longitude, y = latitude, color = nearest.stat)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to any nearest subway station from point in NYC")

all.distance %>% 
  ggplot(aes(x = longitude, y = latitude, color = mean.dist)) +
  geom_point(size = 0.2, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to any nearest subway station from point in NYC")

ada.distance %>% 
  ggplot(aes(x = longitude, y = latitude, color = nearest.ada.stat)) +
  geom_point(size = 0.25, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-Accessible subway station from point in NYC")

ada.distance %>% 
  ggplot(aes(x = longitude, y = latitude, color = mean.ada.stat)) +
  geom_point(size = 0.25, alpha = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-Accessible subway station from point in NYC")

ggmap(nyc_map) +
  geom_point(data = all.distance, aes(x = longitude + 0.002, y = latitude - 0.002, color = mean.dist), size = 0.4, alpha = 0.5) +
  scale_color_viridis(option = "B")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("longitude") +
  ylab("latitude") +
  ggtitle("Direct distance to any nearest subway station from point in NYC")
Warning: Removed 1 rows containing missing values (geom_rect).
Warning: Removed 8 rows containing missing values (geom_point).

Unsurprisingly, some areas are just far from any subway stations and that carries over into what areas are far from ADA stations.

dist.merge <- all.distance %>% 
  inner_join(ada.distance %>% select(-(borough:longitude)), by = "loc_id") %>% 
  select(loc_id, borough:longitude, everything())
glimpse(dist.merge)
Observations: 103,506
Variables: 10
$ loc_id           <chr> "BRONX / 40.7560423362848 / -73.9257889541193...
$ borough          <chr> "BRONX", "BRONX", "BRONX", "BRONX", "BRONX", ...
$ latitude         <dbl> 40.75604, 40.79847, 40.79871, 40.79892, 40.79...
$ longitude        <dbl> -73.92579, -73.91072, -73.91114, -73.91147, -...
$ nearest.stat     <dbl> 3.4589419, 0.5070450, 0.4838138, 0.4651185, 0...
$ mean.dist        <dbl> 6.857871, 3.931517, 3.922247, 3.914472, 3.883...
$ med.dist         <dbl> 6.853549, 3.994368, 3.990962, 3.987878, 3.945...
$ nearest.ada.stat <dbl> 4.1661391, 1.2720243, 1.2495533, 1.2312540, 1...
$ mean.ada.stat    <dbl> 6.805051, 3.901289, 3.889184, 3.879117, 3.854...
$ med.ada.stat     <dbl> 7.480305, 4.468260, 4.454069, 4.442268, 4.417...
# sanity checks for joining:
sapply(all.distance, anyNA)
      loc_id nearest.stat    mean.dist     med.dist      borough 
       FALSE        FALSE        FALSE        FALSE        FALSE 
    latitude    longitude 
       FALSE        FALSE 
sapply(ada.distance, anyNA)
          loc_id nearest.ada.stat    mean.ada.stat     med.ada.stat 
           FALSE            FALSE            FALSE            FALSE 
         borough         latitude        longitude 
           FALSE            FALSE            FALSE 
sapply(dist.merge, anyNA)
          loc_id          borough         latitude        longitude 
           FALSE            FALSE            FALSE            FALSE 
    nearest.stat        mean.dist         med.dist nearest.ada.stat 
           FALSE            FALSE            FALSE            FALSE 
   mean.ada.stat     med.ada.stat 
           FALSE            FALSE 
# joining seems ok
dist.merge <- dist.merge %>% 
  mutate(
    # normalize dist to nearest station by subtraction
    any.vs.ada.min.dist = nearest.ada.stat - nearest.stat,
    # normalize dist to nearest station by division
    any.vs.ada.min.ratio = nearest.ada.stat / nearest.stat,
    # repeat for means
    any.vs.ada.mean.dist = mean.ada.stat - mean.dist,
    any.vs.ada.mean.ratio = mean.ada.stat / mean.dist,
    # repeat for med
    any.vs.ada.med.dist = med.ada.stat - med.dist,
    any.vs.ada.med.ratio = med.ada.stat / med.dist
    ) %>% 
  select(loc_id:longitude, starts_with("any.vs.ada"))
glimpse(dist.merge)
Observations: 103,506
Variables: 10
$ loc_id                <chr> "BRONX / 40.7560423362848 / -73.92578895...
$ borough               <chr> "BRONX", "BRONX", "BRONX", "BRONX", "BRO...
$ latitude              <dbl> 40.75604, 40.79847, 40.79871, 40.79892, ...
$ longitude             <dbl> -73.92579, -73.91072, -73.91114, -73.911...
$ any.vs.ada.min.dist   <dbl> 0.7071972, 0.7649793, 0.7657395, 0.76613...
$ any.vs.ada.min.ratio  <dbl> 1.204455, 2.508701, 2.582715, 2.647183, ...
$ any.vs.ada.mean.dist  <dbl> -0.05282031, -0.03022794, -0.03306252, -...
$ any.vs.ada.mean.ratio <dbl> 0.9922979, 0.9923114, 0.9915705, 0.99096...
$ any.vs.ada.med.dist   <dbl> 0.6267557, 0.4738921, 0.4631062, 0.45438...
$ any.vs.ada.med.ratio  <dbl> 1.091450, 1.118640, 1.116039, 1.113943, ...

Histogram of the distances:

dist.merge %>% 
  select(loc_id:borough, ends_with(".dist")) %>% 
  rename(
    "Minimum_Diff" = any.vs.ada.min.dist,
    "Mean_Diff" = any.vs.ada.mean.dist,
    "Med_Diff" = any.vs.ada.med.dist) %>% 
  gather(key = "Dist.type", value = "Norm_Distance", -(loc_id:borough)) %>% 
  ggplot(aes(x = Norm_Distance)) +
  geom_histogram(bins = 100) +
  facet_wrap(~Dist.type) +
  xlab("Normalized Distance (miles)") +
  ggtitle("Difference in distance\nBetween nearest ADA-Accessible subway station and any subway station")

dist.merge %>% 
  ggplot(aes(any.vs.ada.min.dist)) +
  geom_histogram(bins = 100)

dist.merge %>% 
  ggplot(aes(any.vs.ada.med.dist)) +
  geom_histogram(bins = 100)

dist.merge %>% 
  select(loc_id:borough, ends_with(".ratio")) %>% 
  rename(
    "Minimum_Ratio" = any.vs.ada.min.ratio,
    "Mean_Ratio" = any.vs.ada.mean.ratio,
    "Med_Ratio" = any.vs.ada.med.ratio
  ) %>% 
  gather(key = "Dist.type", value = "Norm_Distance", -(loc_id:borough)) %>% 
  ggplot(aes(x = Norm_Distance)) +
  geom_histogram(bins = 100) +
  facet_wrap(~Dist.type) +
  xlab("Ratio") +
  ggtitle("Ratio in distance\nBetween nearest ADA-Accessible subway station and any subway station")

dist.merge %>% 
  ggplot(aes(any.vs.ada.mean.ratio)) +
  geom_histogram(bins = 100)

dist.merge %>% 
  ggplot(aes(any.vs.ada.med.ratio)) +
  geom_histogram(bins = 100)

Maps of the normalized distances to ADA-Accessible stations:

### subtraction normalization
dist.merge %>% 
  ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.min.dist)) +
  geom_point(size = 0.1) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)") +
  xlab("Longitude") +
  ylab("Latitude")

dist.merge %>% 
  ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.min.dist)) +
  geom_point(size = 0.2, alpha = 0.3) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)") +
  xlab("Longitude") +
  ylab("Latitude")

dist.merge %>% 
  ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.med.ratio)) +
  geom_point(size = 0.2, alpha = 0.3) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap()

What are the rough locations under-serviced? Google map can help here, even though it’s not perfect.

ggmap(nyc_map) +
  geom_point(data = dist.merge, aes(x = longitude + 0.002, y = latitude - 0.002, color = any.vs.ada.min.dist), size = 0.2) +
  scale_color_viridis(option = "B")  +
  ylim(40.55, 40.915568) +  # NYC city limits latitude coordinates
  xlim(-74.1, -73.699215) +  # NYC city limits longitude coordinates
  xlab("Longitude") +
  ylab("Latitude") +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized to distance to nearest any kind of subway station (subtraction)")
Warning: Removed 1 rows containing missing values (geom_rect).
Warning: Removed 8 rows containing missing values (geom_point).

The poor overlay is fairly annoying.

What about the ratio norm?

### ratio normalization
dist.combo %>% 
  filter(any.vs.ada.ratio < 100) %>%  # this number was trial and error
  ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.ratio)) +
  geom_point(size = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized by taking the ratio to the nearest any kind of subway station")
# I was right, looks like its points right on top of stations that are really "far" from ada-accessible stations

dist.combo %>% 
  filter(any.vs.ada.ratio > 10) %>%  # this number was trial and error
  mutate(any.vs.ada.ratio = 10) %>% 
  bind_rows(
    dist.combo %>% 
      filter(any.vs.ada.ratio < 10)
    ) %>% 
  ggplot(aes(x = longitude, y = latitude, color = any.vs.ada.ratio)) +
  geom_point(size = 0.2) +
  scale_color_viridis(option = "B") +
  theme(panel.background = element_rect(fill = 'grey50')) +
  coord_quickmap() +
  ggtitle("Direct distance to nearest ADA-accessible subway station from point in NYC\nNormalized by taking the ratio to the nearest any kind of subway station")

The ratio normalization doesn’t really work so well, the subtraction is probably the better choice in this case.

Next steps: - Separate distance calculations by borough for both the random point and the station, in case “closest” stations are across a river. Probably can keep Queens and Brooklyn together? - Incorporate outage data to find the “true” dead zones? Or areas that are usually suffering from long-term outages of accessibility equipment? - Find a better NYC map overlay - is it the 311 long,lat data that’s the problem, or the map I got from ggmap?

test.map <- readOGR("./data/nyct2010_18a", "nyct2010")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\Dasha\Documents\GitHub\subway_accessibility_map\data\nyct2010_18a", layer: "nyct2010"
## with 2166 features
## It has 11 fields
tm_shape(test.map) +
  tm_borders() 

test.map2 <- fortify(test.map)
ggplot(test.map2, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = NA, color = "black")

test.map3 <- test.map[test.map$BoroName != "Staten Island", ]
tm_shape(test.map3) +
  tm_borders() 

dist.points <- SpatialPoints(dist.merge[, c(4, 3)])
proj4string(dist.points) <- CRS("+init=epsg:4326")
#proj4string(dist.points) <- CRS("+proj=lcc +lat_1=41.03333333333333 +lat_2=40.66666666666666 +lat_0=40.16666666666666 +lon_0=-74 +x_0=300000.0000000001 +y_0=0 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 +no_defs")
plot(dist.points)

dist.points <- spTransform(dist.points, proj4string(test.map3))
dist.by.tract <- over(dist.points, test.map3)
dist.by.tract.frt <- fortify(dist.by.tract)
test4 <- dist.merge %>% 
  bind_cols(dist.by.tract.frt)
test5 <- test4 %>% 
  group_by(CT2010) %>% 
  summarize(avg_to_ada_stat = mean(any.vs.ada.med.dist))
test6 <- test4 %>% 
  group_by(NTAName) %>% 
  summarize(avg_to_ada_stat = mean(any.vs.ada.med.dist)) %>% 
  arrange(desc(avg_to_ada_stat))
merge.to.map <- merge(test.map3, test5, by.x = "CT2010", by.y = "CT2010")
merge.to.map2 <- merge.to.map
merge.to.map2[str_detect(merge.to.map2$NTAName, "park-cemetery-etc") | str_detect(merge.to.map2$NTAName, "Airport"), ] <- NA
tm_shape(merge.to.map2) +
  tm_fill("avg_to_ada_stat", midpoint = NA, palette = viridis(n = 6))

merge.to.map.neigh <- merge(test.map3, test6, by.x = "NTAName", by.y = "NTAName")
merge.to.map.neigh[str_detect(merge.to.map.neigh$NTAName, "park-cemetery-etc") | str_detect(merge.to.map.neigh$NTAName, "Airport"), ] <- NA
tm_shape(merge.to.map.neigh) +
  tm_borders()+
  tm_fill("avg_to_ada_stat", midpoint = NA, palette = viridis(n = 6))

sum(str_detect(merge.to.map$NTAName, "park-cemetery-etc"))
## [1] 39
merge.parks <- merge.to.map[str_detect(merge.to.map$NTAName, "park-cemetery-etc"),]
tm_shape(merge.parks) +
  tm_fill()